{
  "cells": [
    {
      "cell_type": "code",
      "source": [
        "# pip install transformers"
      ],
      "outputs": [],
      "execution_count": 1,
      "metadata": {
        "gather": {
          "logged": 1724129002162
        }
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# pip install torch torchvision torchaudio -U"
      ],
      "outputs": [],
      "execution_count": 2,
      "metadata": {
        "jupyter": {
          "source_hidden": false,
          "outputs_hidden": false
        },
        "nteract": {
          "transient": {
            "deleting": false
          }
        },
        "gather": {
          "logged": 1724129002227
        }
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# pip install flash-attn --no-build-isolation"
      ],
      "outputs": [],
      "execution_count": 3,
      "metadata": {
        "jupyter": {
          "source_hidden": false,
          "outputs_hidden": false
        },
        "nteract": {
          "transient": {
            "deleting": false
          }
        },
        "gather": {
          "logged": 1724129002308
        }
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# ! pip install flash_attn -U"
      ],
      "outputs": [],
      "execution_count": 4,
      "metadata": {
        "jupyter": {
          "source_hidden": false,
          "outputs_hidden": false
        },
        "nteract": {
          "transient": {
            "deleting": false
          }
        },
        "gather": {
          "logged": 1724129002392
        }
      }
    },
    {
      "cell_type": "code",
      "source": [
        "from torch import bfloat16\n",
        "import transformers"
      ],
      "outputs": [],
      "execution_count": 5,
      "metadata": {
        "jupyter": {
          "source_hidden": false,
          "outputs_hidden": false
        },
        "nteract": {
          "transient": {
            "deleting": false
          }
        },
        "gather": {
          "logged": 1724129010695
        }
      }
    },
    {
      "cell_type": "code",
      "source": [
        "model_id = \"../Phi3MOE\""
      ],
      "outputs": [],
      "execution_count": 6,
      "metadata": {
        "jupyter": {
          "source_hidden": false,
          "outputs_hidden": false
        },
        "nteract": {
          "transient": {
            "deleting": false
          }
        },
        "gather": {
          "logged": 1724129010747
        }
      }
    },
    {
      "cell_type": "code",
      "source": [
        "model = transformers.AutoModelForCausalLM.from_pretrained(\n",
        "    model_id,\n",
        "    trust_remote_code=True,\n",
        "    torch_dtype=bfloat16,\n",
        "    device_map='auto'\n",
        ")\n"
      ],
      "outputs": [
        {
          "output_type": "display_data",
          "data": {
            "text/plain": "Loading checkpoint shards:   0%|          | 0/17 [00:00<?, ?it/s]",
            "application/vnd.jupyter.widget-view+json": {
              "version_major": 2,
              "version_minor": 0,
              "model_id": "5cd80a487bad430ba0455753bdeff5fa"
            }
          },
          "metadata": {}
        },
        {
          "output_type": "stream",
          "name": "stderr",
          "text": "WARNING:root:Some parameters are on the meta device device because they were offloaded to the cpu.\n"
        }
      ],
      "execution_count": 7,
      "metadata": {
        "jupyter": {
          "source_hidden": false,
          "outputs_hidden": false
        },
        "nteract": {
          "transient": {
            "deleting": false
          }
        },
        "gather": {
          "logged": 1724129511030
        }
      }
    },
    {
      "cell_type": "code",
      "source": [
        "model.eval()"
      ],
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 8,
          "data": {
            "text/plain": "PhiMoEForCausalLM(\n  (model): PhiMoEModel(\n    (embed_tokens): Embedding(32064, 4096)\n    (layers): ModuleList(\n      (0-31): 32 x PhiMoEDecoderLayer(\n        (self_attn): PhiMoESdpaAttention(\n          (q_proj): Linear(in_features=4096, out_features=4096, bias=True)\n          (k_proj): Linear(in_features=4096, out_features=1024, bias=True)\n          (v_proj): Linear(in_features=4096, out_features=1024, bias=True)\n          (o_proj): Linear(in_features=4096, out_features=4096, bias=True)\n          (rotary_emb): Phi3LongRoPEScaledRotaryEmbedding()\n        )\n        (block_sparse_moe): PhiMoESparseMoeBlock(\n          (gate): Linear(in_features=4096, out_features=16, bias=False)\n          (experts): ModuleList(\n            (0-15): 16 x PhiMoEBlockSparseTop2MLP(\n              (w1): Linear(in_features=4096, out_features=6400, bias=False)\n              (w2): Linear(in_features=6400, out_features=4096, bias=False)\n              (w3): Linear(in_features=4096, out_features=6400, bias=False)\n              (act_fn): SiLU()\n            )\n          )\n        )\n        (input_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n        (post_attention_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n      )\n    )\n    (norm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)\n  )\n  (lm_head): Linear(in_features=4096, out_features=32064, bias=True)\n)"
          },
          "metadata": {}
        }
      ],
      "execution_count": 8,
      "metadata": {
        "jupyter": {
          "source_hidden": false,
          "outputs_hidden": false
        },
        "nteract": {
          "transient": {
            "deleting": false
          }
        },
        "gather": {
          "logged": 1724129511125
        }
      }
    },
    {
      "cell_type": "code",
      "source": [
        "tokenizer = transformers.AutoTokenizer.from_pretrained(model_id)"
      ],
      "outputs": [],
      "execution_count": 9,
      "metadata": {
        "jupyter": {
          "source_hidden": false,
          "outputs_hidden": false
        },
        "nteract": {
          "transient": {
            "deleting": false
          }
        },
        "gather": {
          "logged": 1724129511238
        }
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# generate_text = transformers.pipeline(\n",
        "#     model=model, tokenizer=tokenizer,\n",
        "#     return_full_text=False,  # if using langchain set True\n",
        "#     task=\"text-generation\",\n",
        "#     # we pass model parameters here too\n",
        "#     temperature=0.1,  # 'randomness' of outputs, 0.0 is the min and 1.0 the max\n",
        "#     top_p=0.15,  # select from top tokens whose probability add up to 15%\n",
        "#     top_k=0,  # select from top 0 tokens (because zero, relies on top_p)\n",
        "#     max_new_tokens=2048,  # max number of tokens to generate in the output\n",
        "#     repetition_penalty=1.1  # if output begins repeating increase\n",
        "# )\n",
        "\n",
        "pipe = transformers.pipeline(\n",
        "\n",
        "    \"text-generation\",\n",
        "\n",
        "    model=model,\n",
        "\n",
        "    tokenizer=tokenizer,\n",
        "\n",
        ")"
      ],
      "outputs": [
        {
          "output_type": "stream",
          "name": "stderr",
          "text": "2024-08-20 04:51:54.431510: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory\n2024-08-20 04:51:54.431543: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.\nThe model 'PhiMoEForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CohereForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'DbrxForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FuyuForCausalLM', 'GemmaForCausalLM', 'Gemma2ForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'JambaForCausalLM', 'JetMoeForCausalLM', 'LlamaForCausalLM', 'MambaForCausalLM', 'Mamba2ForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MistralForCausalLM', 'MixtralForCausalLM', 'MptForCausalLM', 'MusicgenForCausalLM', 'MusicgenMelodyForCausalLM', 'MvpForCausalLM', 'NemotronForCausalLM', 'OlmoForCausalLM', 'OpenLlamaForCausalLM', 'OpenAIGPTLMHeadModel', 'OPTForCausalLM', 'PegasusForCausalLM', 'PersimmonForCausalLM', 'PhiForCausalLM', 'Phi3ForCausalLM', 'PLBartForCausalLM', 'ProphetNetForCausalLM', 'QDQBertLMHeadModel', 'Qwen2ForCausalLM', 'Qwen2MoeForCausalLM', 'RecurrentGemmaForCausalLM', 'ReformerModelWithLMHead', 'RemBertForCausalLM', 'RobertaForCausalLM', 'RobertaPreLayerNormForCausalLM', 'RoCBertForCausalLM', 'RoFormerForCausalLM', 'RwkvForCausalLM', 'Speech2Text2ForCausalLM', 'StableLmForCausalLM', 'Starcoder2ForCausalLM', 'TransfoXLLMHeadModel', 'TrOCRForCausalLM', 'WhisperForCausalLM', 'XGLMForCausalLM', 'XLMWithLMHeadModel', 'XLMProphetNetForCausalLM', 'XLMRobertaForCausalLM', 'XLMRobertaXLForCausalLM', 'XLNetLMHeadModel', 'XmodForCausalLM'].\n"
        }
      ],
      "execution_count": 10,
      "metadata": {
        "jupyter": {
          "source_hidden": false,
          "outputs_hidden": false
        },
        "nteract": {
          "transient": {
            "deleting": false
          }
        },
        "gather": {
          "logged": 1724129523437
        }
      }
    },
    {
      "cell_type": "code",
      "source": [
        "generation_args = {\n",
        "\n",
        "    \"max_new_tokens\": 512,\n",
        "\n",
        "    \"return_full_text\": False,\n",
        "\n",
        "    \"temperature\": 0.3,\n",
        "\n",
        "    \"do_sample\": False,\n",
        "\n",
        "}"
      ],
      "outputs": [],
      "execution_count": 11,
      "metadata": {
        "jupyter": {
          "source_hidden": false,
          "outputs_hidden": false
        },
        "nteract": {
          "transient": {
            "deleting": false
          }
        },
        "gather": {
          "logged": 1724129523505
        }
      }
    },
    {
      "cell_type": "code",
      "source": [
        "sys_msg = \"\"\"You are a helpful AI assistant, you are an agent capable of using a variety of tools to answer a question. Here are a few of the tools available to you:\n",
        "\n",
        "- Blog: This tool helps you describe a certain knowledge point and content, and finally write it into Twitter or Facebook style content\n",
        "- Translate: This is a tool that helps you translate into any language, using plain language as required\n",
        "\n",
        "To use these tools you must always respond in JSON format containing `\"tool_name\"` and `\"input\"` key-value pairs. For example, to answer the question, \"Build Muliti Agents with MOE models\" you must use the calculator tool like so:\n",
        "\n",
        "```json\n",
        "\n",
        "{\n",
        "    \"tool_name\": \"Blog\",\n",
        "    \"input\": \"Build Muliti Agents with MOE models\"\n",
        "}\n",
        "\n",
        "```\n",
        "\n",
        "Or to translate the question \"can you introduce yourself in Chinese\" you must respond:\n",
        "\n",
        "```json\n",
        "\n",
        "{\n",
        "    \"tool_name\": \"Search\",\n",
        "    \"input\": \"can you introduce yourself in Chinese\"\n",
        "}\n",
        "\n",
        "```\n",
        "\n",
        "Remember just output the final result, ouput in JSON format containing `\"agentid\"`,`\"tool_name\"` , `\"input\"` and `\"output\"`  key-value pairs .:\n",
        "\n",
        "```json\n",
        "\n",
        "[\n",
        "\n",
        "\n",
        "{   \"agentid\": \"step1\",\n",
        "    \"tool_name\": \"Blog\",\n",
        "    \"input\": \"Build Muliti Agents with MOE models\",\n",
        "    \"output\": \".........\"\n",
        "},\n",
        "\n",
        "{   \"agentid\": \"step2\",\n",
        "    \"tool_name\": \"Search\",\n",
        "    \"input\": \"can you introduce yourself in Chinese\",\n",
        "    \"output\": \".........\"\n",
        "},\n",
        "{\n",
        "    \"agentid\": \"final\"\n",
        "    \"tool_name\": \"Result\",\n",
        "    \"output\": \".........\"\n",
        "}\n",
        "]\n",
        "\n",
        "```\n",
        "\n",
        "The users answer is as follows.\n",
        "\"\"\""
      ],
      "outputs": [],
      "execution_count": 12,
      "metadata": {
        "jupyter": {
          "source_hidden": false,
          "outputs_hidden": false
        },
        "nteract": {
          "transient": {
            "deleting": false
          }
        },
        "gather": {
          "logged": 1724129523768
        }
      }
    },
    {
      "cell_type": "code",
      "source": [
        "def instruction_format(sys_message: str, query: str):\n",
        "    # note, don't \"</s>\" to the end\n",
        "    return f'<|system|> {sys_message} <|end|>\\n<|user|> {query} <|end|>\\n<|assistant|>'"
      ],
      "outputs": [],
      "execution_count": 13,
      "metadata": {
        "jupyter": {
          "source_hidden": false,
          "outputs_hidden": false
        },
        "nteract": {
          "transient": {
            "deleting": false
          }
        },
        "gather": {
          "logged": 1724129523822
        }
      }
    },
    {
      "cell_type": "code",
      "source": [
        "query ='Write something about Generative AI with MOE , translate it to Chinese'"
      ],
      "outputs": [],
      "execution_count": 14,
      "metadata": {
        "jupyter": {
          "source_hidden": false,
          "outputs_hidden": false
        },
        "nteract": {
          "transient": {
            "deleting": false
          }
        },
        "gather": {
          "logged": 1724129523875
        }
      }
    },
    {
      "cell_type": "code",
      "source": [
        "input_prompt = instruction_format(sys_msg, query)\n",
        "\n"
      ],
      "outputs": [],
      "execution_count": 15,
      "metadata": {
        "jupyter": {
          "source_hidden": false,
          "outputs_hidden": false
        },
        "nteract": {
          "transient": {
            "deleting": false
          }
        },
        "gather": {
          "logged": 1724129523921
        }
      }
    },
    {
      "cell_type": "code",
      "source": [
        "input_prompt"
      ],
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 16,
          "data": {
            "text/plain": "'<|system|> You are a helpful AI assistant, you are an agent capable of using a variety of tools to answer a question. Here are a few of the tools available to you:\\n\\n- Blog: This tool helps you describe a certain knowledge point and content, and finally write it into Twitter or Facebook style content\\n- Translate: This is a tool that helps you translate into any language, using plain language as required\\n\\nTo use these tools you must always respond in JSON format containing `\"tool_name\"` and `\"input\"` key-value pairs. For example, to answer the question, \"Build Muliti Agents with MOE models\" you must use the calculator tool like so:\\n\\n```json\\n\\n{\\n    \"tool_name\": \"Blog\",\\n    \"input\": \"Build Muliti Agents with MOE models\"\\n}\\n\\n```\\n\\nOr to translate the question \"can you introduce yourself in Chinese\" you must respond:\\n\\n```json\\n\\n{\\n    \"tool_name\": \"Search\",\\n    \"input\": \"can you introduce yourself in Chinese\"\\n}\\n\\n```\\n\\nRemember just output the final result, ouput in JSON format containing `\"agentid\"`,`\"tool_name\"` , `\"input\"` and `\"output\"`  key-value pairs .:\\n\\n```json\\n\\n[\\n\\n\\n{   \"agentid\": \"step1\",\\n    \"tool_name\": \"Blog\",\\n    \"input\": \"Build Muliti Agents with MOE models\",\\n    \"output\": \".........\"\\n},\\n\\n{   \"agentid\": \"step2\",\\n    \"tool_name\": \"Search\",\\n    \"input\": \"can you introduce yourself in Chinese\",\\n    \"output\": \".........\"\\n},\\n{\\n    \"agentid\": \"final\"\\n    \"tool_name\": \"Result\",\\n    \"output\": \".........\"\\n}\\n]\\n\\n```\\n\\nThe users answer is as follows.\\n <|end|>\\n<|user|> Write something about Generative AI with MOE , translate it to Chinese <|end|>\\n<|assistant|>'"
          },
          "metadata": {}
        }
      ],
      "execution_count": 16,
      "metadata": {
        "jupyter": {
          "source_hidden": false,
          "outputs_hidden": false
        },
        "nteract": {
          "transient": {
            "deleting": false
          }
        },
        "gather": {
          "logged": 1724129523983
        }
      }
    },
    {
      "cell_type": "code",
      "source": [
        "import torch\n",
        "\n",
        "torch.cuda.empty_cache() \n",
        "\n",
        "import os\n",
        "\n",
        "os.environ[\"PYTORCH_CUDA_ALLOC_CONF\"] = \"expandable_segments:True \""
      ],
      "outputs": [],
      "execution_count": 17,
      "metadata": {
        "jupyter": {
          "source_hidden": false,
          "outputs_hidden": false
        },
        "nteract": {
          "transient": {
            "deleting": false
          }
        },
        "gather": {
          "logged": 1724129524034
        }
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# res = generate_text(input_prompt)\n",
        "\n",
        "output = pipe(input_prompt, **generation_args)"
      ],
      "outputs": [
        {
          "output_type": "stream",
          "name": "stderr",
          "text": "/anaconda/envs/azureml_py38/lib/python3.9/site-packages/transformers/generation/configuration_utils.py:567: UserWarning: `do_sample` is set to `False`. However, `temperature` is set to `0.3` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `temperature`.\n  warnings.warn(\nThe `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.\n"
        }
      ],
      "execution_count": 18,
      "metadata": {
        "jupyter": {
          "source_hidden": false,
          "outputs_hidden": false
        },
        "nteract": {
          "transient": {
            "deleting": false
          }
        },
        "gather": {
          "logged": 1724130076595
        }
      }
    },
    {
      "cell_type": "code",
      "source": [
        "output[0]['generated_text']"
      ],
      "outputs": [
        {
          "output_type": "execute_result",
          "execution_count": 19,
          "data": {
            "text/plain": "' ```json\\n\\n[\\n\\n{   \"agentid\": \"step1\",\\n    \"tool_name\": \"Blog\",\\n    \"input\": \"Generative AI with MOE\",\\n    \"output\": \"Generative AI with MOE (Mixture of Experts) is a powerful approach that combines the strengths of generative models and the flexibility of MOE architecture. This hybrid model can generate high-quality, diverse, and contextually relevant content, making it suitable for various applications such as content creation, data augmentation, and more.\"\\n},\\n\\n{   \"agentid\": \"step2\",\\n    \"tool_name\": \"Translate\",\\n    \"input\": \"Generative AI with MOE is a powerful approach that combines the strengths of generative models and the flexibility of MOE architecture. This hybrid model can generate high-quality, diverse, and contextually relevant content, making it suitable for various applications such as content creation, data augmentation, and more.\",\\n    \"output\": \"基于生成AI的MOE（Mixture of Experts）是一种强大的方法，它结合了生成模型的优势和MOE架构的灵活性。这种混合模型可以生成高质量、多样化且上下文相关的内容，使其适用于各种应用，如内容创建、数据增强等。\"\\n},\\n{\\n    \"agentid\": \"final\",\\n    \"tool_name\": \"Result\",\\n    \"output\": \"基于生成AI的MOE（Mixture of Experts）是一种强大的方法，它结合了生成模型的优势和MOE架构的灵活性。这种混合模型可以生成高质量、多样化且上下文相关的内容，使其适用于各种应用，如内容创建、数据增强等。\"\\n}\\n]\\n```'"
          },
          "metadata": {}
        }
      ],
      "execution_count": 19,
      "metadata": {
        "jupyter": {
          "source_hidden": false,
          "outputs_hidden": false
        },
        "nteract": {
          "transient": {
            "deleting": false
          }
        },
        "gather": {
          "logged": 1724130076669
        }
      }
    }
  ],
  "metadata": {
    "kernelspec": {
      "name": "python38-azureml",
      "language": "python",
      "display_name": "Python 3.8 - AzureML"
    },
    "language_info": {
      "name": "python",
      "version": "3.9.19",
      "mimetype": "text/x-python",
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "pygments_lexer": "ipython3",
      "nbconvert_exporter": "python",
      "file_extension": ".py"
    },
    "microsoft": {
      "ms_spell_check": {
        "ms_spell_check_language": "en"
      },
      "host": {
        "AzureML": {
          "notebookHasBeenCompleted": true
        }
      }
    },
    "kernel_info": {
      "name": "python38-azureml"
    },
    "nteract": {
      "version": "nteract-front-end@1.0.0"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 2
}
